import numpy as np
import pandas as pd
from datetime import datetime
from glob import glob
from StringIO import StringIO
''' created quick class based on August 1st Analysis '''
from MTAReader import MTAReader
created quick class MTAReader based on August 1st Analysis. Should make notebook easier to follow.
Ideally we could just read in the first and then the last file, but I found errors in the first MTA file for August 1st where the odometer counters were reset, so we do indeed need to read in the full files... Otherwise we could potentially lose up to N days of readings without ever knowing.
Note: now I have to fix the NaN rows, which represent the ~4 hour gap between the last row in the first file and the first row in the next file. These files account for a total of 70.4 MB of data, so I decided it is quickest and most plausible to ignore this issue completely at this stage - instead combining the files into one big string. I'm running this on my old Macbook Pro and it's fairly quick. The problem is this does not scale if we go to N files, but the 80-20 rule applies and the analysis will be the same. For the interactive section SQL should make this much easier; I just think it's a bit too verbose to use in this setting at the moment. I will be sure to compare results here to my webapp, once implemented.
Note: Found error on line 26668 in turnstile_130706.txt so I deleted the null characters manually, and this worked. Could use string replacement in below, but I want the data corrected later too.
# Concatenate every raw turnstile file into one in-memory buffer and hand it
# to MTAReader. Quick-and-dirty: fine for ~70 MB, does not scale to N files.
file_chunks = []
for file_name in glob('data/turnstile_*'):
    print('Reading', file_name)
    # Context manager closes each handle promptly (the originals leaked them).
    with open(file_name) as turnstile_file:
        file_chunks.append(turnstile_file.read())
# Join once at the end: repeated `+=` on a string is quadratic in total size.
combined_files = ''.join(file_chunks)
print('Len of all files together', len(combined_files))
combined_files_buffer = StringIO(combined_files)
combined_files_buffer
mta = MTAReader('Misc', data_str=combined_files_buffer)
Note: these could be a little off, because the first reading of the day has no previous reading to compute a difference against.
def dateparse(timestamp_str):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string into a datetime.

    Uses datetime.strptime directly: the `pd.datetime` alias is deprecated
    and removed in modern pandas.
    """
    return datetime.strptime(timestamp_str, '%Y-%m-%d %H:%M:%S')
# Load the sanitized week of data, parsing both timestamp columns up front.
all_for_period = pd.read_csv(
    'data_sanitized/2013-07-27 - 2013-08-02.csv',
    parse_dates=['datetime', 'date'],
    date_parser=dateparse
)
# Keep only August 1st and persist it for the comparison below.
is_august_first = all_for_period.date == datetime(2013, 8, 1)
august_1_2013 = all_for_period[is_august_first]
august_1_2013.to_csv('august_1_class.csv', index=False)
# Daily totals from the sanitized file: entries, exits, and their grand total.
daily_totals = {
    'exits': august_1_2013.exits.sum(),
    'entries': august_1_2013.entries.sum(),
}
august_1_total_traffic = pd.Series(daily_totals)
august_1_total_traffic['total'] = august_1_total_traffic.sum()
august_1_total_traffic
# Repeat the daily totals using the big combined-file DataFrame...
big_df = mta.df
august_1_2013_from_big = big_df[big_df.date == datetime(2013, 8, 1)]
big_totals = {
    'exits': august_1_2013_from_big.exits.sum(),
    'entries': august_1_2013_from_big.entries.sum(),
}
august_1_total_traffic_big = pd.Series(big_totals)
august_1_total_traffic_big['total'] = august_1_total_traffic_big.sum()
august_1_total_traffic_big
# ...then compare against the sanitized-file totals, raw and relative.
difference_between_results = august_1_total_traffic - august_1_total_traffic_big
difference_between_results
normalized_difference = difference_between_results / august_1_total_traffic
normalized_difference
# Rank every (station, scp) turnstile unit on August 1st by its summed flow.
# groupby().sum() replaces the manual append-loop (same result, one pass in C).
scp_by_usage = (
    august_1_2013
    .groupby(['station', 'scp'], as_index=False)
    .cummulative_flow.sum()
)
# DataFrame.sort was removed from pandas; sort_values is the supported API.
scp_by_usage = scp_by_usage.sort_values(
    ['cummulative_flow', 'scp', 'station'],
    ascending=False
).reset_index(drop=True)
scp_by_usage[0:10]
# Same per-turnstile ranking, built from the big combined-file frame.
scp_by_usage_big = (
    august_1_2013_from_big
    .groupby(['station', 'scp'], as_index=False)
    .cummulative_flow.sum()
)
# DataFrame.sort was removed from pandas; sort_values is the supported API.
scp_by_usage_big = scp_by_usage_big.sort_values(
    ['cummulative_flow', 'scp', 'station'],
    ascending=False
).reset_index(drop=True)
scp_by_usage_big[0:10]
# Align the two rankings on (scp, station) and compute per-turnstile deltas.
# The duplicate cummulative_flow columns get pandas' default _x/_y suffixes.
difference_stations = pd.merge(
    scp_by_usage,
    scp_by_usage_big,
    on=["scp", "station"],  # same key both sides, so `on=` replaces left_on/right_on
    how="left"
)
difference_stations['difference'] = difference_stations.cummulative_flow_x - difference_stations.cummulative_flow_y
# DataFrame.sort was removed from pandas; sort_values is the supported API.
difference_stations = difference_stations.sort_values('difference', ascending=False)
difference_stations['difference'].sum()
The total counts are slightly off (by < 1%), but the top stations are the same. This is probably due to slight difference between those small stations which have "bad" hours and are inconsistent. This could be because the odometer readings were reset and they picked up additional information from the days previous
%matplotlib inline
# Visual sanity check of the combined dataset: entries over time for a single
# turnstile (scp 00-00-01 at 1 AVE), first a few rows then the full series.
mta.df[(mta.df.scp == '00-00-01') & (mta.df.station == '1 AVE')].head().plot(x='datetime', y='entries')
mta.df[(mta.df.scp == '00-00-01') & (mta.df.station == '1 AVE')].plot(x='datetime', y='entries')
# Narrow to just the columns needed for the daily per-station pivot below.
test = mta.df[['datetime', 'entries', 'exits', 'station', 'date']]#.plot()#(x='datetime', y='entries')
test.head()
We need monthly data, so split by date and get into dfs for entries and exits
# Pivot to one row per date, one column per station, separately for entries
# and exits: nested dict {station: {date: total}} then DataFrame at the end.
df_by_date = {
    'entries': {},
    'exits': {},
}
# enumerate() was only feeding a debug print; iterate the groups directly.
for (date, station), data in test.groupby(['date', 'station']):
    # setdefault replaces the repeated "if station not in ..." membership checks
    df_by_date['entries'].setdefault(station, {})[date] = data.entries.sum()
    df_by_date['exits'].setdefault(station, {})[date] = data.exits.sum()
df_by_date['entries'] = pd.DataFrame(df_by_date['entries'])
df_by_date['exits'] = pd.DataFrame(df_by_date['exits'])
df_by_date['entries'].head()
df_by_date['entries'].plot(figsize=(20, 10), legend=False)
OK try with subset of stations...?
df_by_date['entries'].columns.tolist()
quarter_cols = len(df_by_date['entries'].columns.tolist()) / 4
df_by_date['entries'][df_by_date['entries'].columns.tolist()[:quarter_cols]].plot( figsize=(20,10), legend=False)
Pretty cool. we can see week day commuting.
Look, we found July 4th! It looks like fewer people tend to use the subway then. Also notice the black and yellow lines (towards the bottom) which follow a different trend than the others - more traffic on the weekends. I bet these are stations like Coney Island or other vacation-like destinations.
Now on to trying to find messed up ones...
# Second quarter of the stations, still hunting for anomalous counters.
df_by_date['entries'][df_by_date['entries'].columns.tolist()[quarter_cols: quarter_cols * 2]].plot( figsize=(20,10), legend=False)
# Total entries per station; absurdly large totals flag odometer resets.
screwed_up_entries = df_by_date['entries'].sum()
# Series.sort (in-place) was removed from pandas; sort_values returns a copy,
# so reassign to keep the same downstream state.
screwed_up_entries = screwed_up_entries.sort_values()
screwed_up_entries_normalized = screwed_up_entries / 1e8 # normalize by key in chart
screwed_up_entries_normalized = screwed_up_entries_normalized.sort_values(ascending=False)
screwed_up_entries_normalized[0:11]
bad_stations_for_entries = screwed_up_entries_normalized[0:11] # 11 chosen via trial and error
bad_stations_for_entries
entry_stations_to_fix = bad_stations_for_entries.index.values.tolist()
bad_stations_for_entries.index.values.tolist()
Now I need to fix the mta dataset, we know the relative date range and the stations.
# Plot and display just the flagged stations to eyeball the spikes before fixing.
df_by_date['entries'][bad_stations_for_entries.index.values.tolist()].plot()
df_by_date['entries'][bad_stations_for_entries.index.values.tolist()]
so the bad dates to fix are:
| station | date |
|---|---|
| DYRE AVE | 2013-07-04 |
| 7 AVE | 2013-07-19 |
| INWOOD-207 ST | 2013-07-12 |
| 77 ST | 2013-07-14 |
| FOREST HILLS-71 | 2013-07-08 |
| BEACH 98 ST | 2013-07-09 |
| etc | etc |
(worst offenders)
# Replace each flagged station's worst spike with the station mean.
# .copy(): the original plain assignment aliased df_by_date['entries'], so the
# "fixes" silently mutated the source frame too.
fixed_entries = df_by_date['entries'].copy()
for station in entry_stations_to_fix:
    print(station, fixed_entries[station].max(), fixed_entries[station].idxmax(), fixed_entries[station].mean())
    # Capture the spike's row ONCE: recomputing idxmax after zeroing would
    # point at a different row (the second-highest value), which is what the
    # original chained `.xs(...)[...] =` version did.
    spike_idx = fixed_entries[station].idxmax()
    fixed_entries.loc[spike_idx, station] = 0  # drop the spike first...
    # ...then replace it with the mean of the remaining (spike-zeroed) values.
    fixed_entries.loc[spike_idx, station] = fixed_entries[station].mean()
fixed_entries
fixed_entries.plot(figsize=(20, 10), legend=False)
A few more to fix...
# Hunt remaining spikes: the global max value, which station holds it, and
# that station's full daily series.
fixed_entries.max().max()
fixed_entries.max().idxmax()
fixed_entries['PAVONIA/NEWPORT']
We did the August 1st analysis assuming it didn't break any trend, may have to go back now...
# .loc writes into the frame itself; the original chained assignment through
# .xs(...) hits a copy and can be silently lost.
fixed_entries.loc[datetime(2013, 8, 1), 'PAVONIA/NEWPORT'] = 0 # should not matter for July analysis, just for printing a graph
fixed_entries.plot( figsize=(20,10), legend=False)
# Restrict to July via a label-based slice on the datetime index (inclusive).
july_entries = fixed_entries[datetime(2013, 7, 1): datetime(2013, 7, 31)]
july_entries.index
july_entries.head()
assuming similar errors for exits.
df_by_date['exits'].plot( figsize=(20,10), legend=False)
# .copy() so the spike fixes below write into an independent July frame rather
# than a slice of df_by_date['exits'] (avoids SettingWithCopy behaviour).
july_exits = df_by_date['exits'][datetime(2013, 7, 1): datetime(2013, 7, 31)].copy()
july_exits.head()
# Per-station July maximum; huge maxima flag bad counters for exits.
max_exits = july_exits.max()
# Series.sort was removed from pandas; sort_values returns the sorted copy.
max_exits = max_exits.sort_values(ascending=False)
max_exits[0:15]
max_exits_normalized = max_exits / 1e9
max_exits_normalized = max_exits_normalized.sort_values(ascending=False)
max_exits_normalized[0:15]
max_exits[0:8]
exit_stations_to_fix = max_exits[0:8].index.values.tolist()
exit_stations_to_fix
We need to fix 0:8
# Replace each flagged exit station's worst spike with the station mean.
for station in exit_stations_to_fix:
    print(station, july_exits[station].max(), july_exits[station].idxmax(), july_exits[station].mean())
    # Capture the spike's row ONCE (recomputing idxmax after zeroing would hit
    # a different row) and write with .loc: chained `.xs(...)[...] =` can be
    # silently lost on a copy.
    spike_idx = july_exits[station].idxmax()
    july_exits.loc[spike_idx, station] = 0  # drop the spike first...
    # ...then replace it with the mean of the remaining (spike-zeroed) values.
    july_exits.loc[spike_idx, station] = july_exits[station].mean()
july_exits
july_exits.plot( figsize=(20,10), legend=False)
# One stragglers left: find and zero it directly.
july_exits.max().max()
july_exits.max().idxmax()
july_exits['FOREST HILLS-71']
july_exits.loc[datetime(2013, 7, 8), 'FOREST HILLS-71'] = 0
july_exits.plot( figsize=(20,10), legend=False)
# Combined station load for July; elementwise + aligns on the shared
# date index and station columns.
july_busyness = july_exits + july_entries
july_exits.head()
july_entries.head()
july_busyness.head()
july_busyness.plot(figsize=(20,10), legend=False)
# Mean daily busyness per station, busiest first.
max_stations = july_busyness.mean()
# Series.sort was removed from pandas; sort_values returns the sorted copy.
max_stations = max_stations.sort_values(ascending=False)
max_stations.head()
max_stations.tail()
This is consistent with our previous analysis. The stations that have many transfer connections and are located in central areas are the busiest.
We will have to reconstruct this using the data above. Good thing is that we know the bad days of data
| Station | day |
|---|---|
| PAVONIA/NEWPORT | 2013-08-01 |
| DYRE AVE | 2013-07-17 |
| 7 AVE | 2013-07-09 |
| INWOOD-207 ST | 2013-07-18 |
| 77 ST | 2013-07-23 |
| FOREST HILLS-71 | 2013-07-11 |
| BEACH 98 ST | 2013-07-16 |
| MARCY AVE | 2013-08-01 |
| 34 ST-PENN STA | 2013-07-22 |
| MORGAN AVE | 2013-07-10 |
| ATLANTIC AVE | 2013-07-18 |
| 42 ST-GRD CNTRL | 2013-07-09 |
| Station | day |
|---|---|
| FOREST HILLS-71 | 2013-07-08 |
| FOREST HILLS-71 | 2013-07-01 |
| DYRE AVE | 2013-07-04 |
| MARCY AVE | 2013-07-10 |
| 7 AVE | 2013-07-19 |
| INWOOD-207 ST | 2013-07-12 |
| ATLANTIC AVE | 2013-07-29 |
| 77 ST | 2013-07-14 |
| EXCHANGE PLACE | 2013-07-30 |
from datetime import timedelta

# Collect every Friday in July 2013 by walking the month day by day.
date_start = datetime(2013, 7, 1)
date_end = datetime(2013, 7, 31)
fridays_in_july = []
current_date = date_start
while current_date <= date_end:
    # weekday() == 4 is Friday; strftime("%A") compares a locale-dependent
    # name and breaks on non-English locales.
    if current_date.weekday() == 4:
        print('Friday:', current_date)
        fridays_in_july.append(current_date)
    current_date += timedelta(days=1)
fridays_in_july
Let's see if any of those dates intersect with the ones we knew may have errors...
| date | fix |
|---|---|
| 2013-07-12 | INWOOD-207 ST |
| 2013-07-19 | 7th Ave |
Ok that's manageable. Let's see if it is even possible for these to be among the highest
# Inspect INWOOD-207 ST on its suspect date, midnight through 4am only.
inwood_data = mta.df[(mta.df.station == 'INWOOD-207 ST') & (mta.df.date == datetime(2013, 7, 12))]
early_morning = (
    (inwood_data.datetime >= datetime(2013, 7, 12, 0, 0, 0))
    & (inwood_data.datetime < datetime(2013, 7, 12, 4, 0, 0))
)
inwood_data[early_morning][['datetime', 'entries', 'exits']]
OK no need to fix any data there
# Check 7 AVE on its suspect date (2013-07-19 per the tables above).
# NOTE(review): the original filtered on station '7th Ave' and 2013-07-12; the
# dataset names this station '7 AVE' and its flagged date is 07-19, so the
# original query matched nothing and the "no data relevant" conclusion was
# untested -- re-verify with this corrected filter.
seventh_ave = mta.df[(mta.df.station == '7 AVE') & (mta.df.date == datetime(2013, 7, 19))]
seventh_ave[
    (seventh_ave.datetime >= datetime(2013, 7, 19, 0, 0, 0)) & (seventh_ave.datetime < datetime(2013, 7, 19, 4, 0, 0))
][['datetime', 'entries', 'exits']]
and no data is relevant there
fridays_in_july
# Sanity check: the isin() filter should match the summed per-date row counts.
count = 0
for date in fridays_in_july:
    count += len(mta.df[mta.df.date == date])
# .copy(): the 'hour' column below is assigned into this frame; writing into a
# boolean-indexed slice of mta.df triggers SettingWithCopy and may be lost.
data_for_days = mta.df[mta.df.date.isin(fridays_in_july)].copy()
data_for_days.count().max() == count
data_for_days.date.unique()
# NOTE(review): if 'datetime' is datetime64, `.dt.hour` would be the faster
# vectorized form -- confirm the dtype MTAReader produces before switching.
data_for_days['hour'] = data_for_days.datetime.apply(lambda x: x.hour)
data_for_days[['hour', 'datetime']].head()
# Midnight through 4am inclusive.
fridays_morning_data = data_for_days[
    (data_for_days.hour >= 0) & (data_for_days.hour <= 4)
]
len(fridays_morning_data)
# Entries per (station, hour) over all July Fridays, as a nested dict
# {station: {hour: total}} then a DataFrame (stations become columns).
friday_morning_busyness = {}
for (station, hour), rows in fridays_morning_data.groupby(['station', 'hour']):
    hourly = friday_morning_busyness.setdefault(station, {})
    hourly[hour] = rows.entries.sum() #+ data.exits.sum()
friday_morning_busyness = pd.DataFrame(friday_morning_busyness)
friday_morning_busyness
Note, remember this data comes in once every ~4 hours, unless there is a maintenance event, so makes sense only 1-2 values per col
friday_morning_busyness.plot(legend=False, figsize=(20,10))
# Average entries per Friday (sum over Fridays / number of Fridays).
most_busy = friday_morning_busyness.sum() / len(fridays_in_july)
# Series.sort was removed from pandas; sort_values returns the sorted copy.
most_busy = most_busy.sort_values(ascending=False)
most_busy.head()
Using the "highest avg", 42 ST-TIMES SQ had the highest average number of entries between midnight & 4am on Fridays in July 2013
# Top-10 and bottom-5 stations by average early-Friday entries, with legends.
friday_morning_busyness[most_busy[0:10].index].plot(legend=True, figsize=(20,10))
friday_morning_busyness[most_busy.tail().index].plot(legend=True, figsize=(20,10))
This analysis follows all of the others: the stations that are most entered are central hubs which have many train connections. I will also add that these are geographically correlated with many of the busy, happening areas in the city, which makes sense because the MTA should have built stations where people needed to go, and people would move to these areas. It's recursive!